

***SET THE DIRECToRY to Replication_package
clear
global rootpath "D:\Replication_package"

********************************************************************************
//Here I'm assembling the data used for estimation. 

use "$rootpath/processed_data/Temp_Reg_FileV4.dta", clear
sort facility_id 
rename facility_id eis_id
merge m:1 eis_id using "$rootpath/processed_data/eis_id_to_frsnumber1.dta"
drop if _merge!=3
drop _merge
drop fips_code
rename eis_id facility_id
rename fips fips_code
sort fips_code
merge m:1 fips_code using "$rootpath/processed_data/fips_GDP.dta"
keep if _merge==3
drop _merge
gen multiplier=.
//Note Y16 is 2008. 
foreach num of numlist 16/27 {
gen year_temp=1992 + `num'
replace multiplier=Y`num'/Y22 if year==year_temp
drop year_temp
}

replace tot_damage=tot_damage*multiplier
replace log_damage=asinh(tot_damage)
xtset facility_id year

sort facility_id year
egen min_year=min(year), by(facility_id)
egen max_year=max(year), by(facility_id)
gen reg_nei=0
replace reg_nei=tot_damage if year==2011|year==2014|year==2017
egen max_reg_nei=max(reg_nei), by(facility_id)

gen off_nei=0
replace off_nei=tot_damage if year!=2011&year!=2014&year!=2017&year!=2008
egen max_off_nei=max(off_nei), by(facility_id)

gen zero=0
replace zero=1 if tot_damage==0
sort facility_id year
egen tot_zero=sum(zero), by(facility_id)

gen log_damage3=asinh(tot_damage/1000)
gen log_damage2=asinh(tot_damage/1000000)
gen log_damage4=asinh(tot_damage/10000)
gen log_damage5=asinh(tot_damage/100000)
gen log_damage55=asinh(tot_damage/(1.47*100000))
gen log_damage22=asinh(tot_damage/(1.47*1000000))

gen vintage=0
replace vintage=1 if year<2011
replace vintage=2 if year>2010&year<2014
replace vintage=3 if year>2013&year<2017
replace vintage=4 if year>2016
egen max_vintage=max(vintage), by(facility_id)
egen min_vintage=min(vintage), by(facility_id)

sort facility_id year
gen log_emissions3=asinh((NH3 + PM25PRI + NOX + SO2 + VOC)*(1/1000))
gen log_emissions4=asinh((NH3 + PM25PRI + NOX + SO2 + VOC)*(1/10))
gen tot_emissions=(NH3 + PM25PRI + NOX + SO2 + VOC)
egen variety=nvals(tot_damage), by(facility_id)
egen variety_pol=nvals(log_emissions), by(facility_id)
egen max_damage=max(tot_damage), by(facility_id)
gen bad_obs=0
replace bad_obs=1 if tot_emissions>=10000
egen max_bad_obs=max(bad_obs), by(facility_id)
//WORKS NO 4 facility_name, abibow. Dave Johnston
gen bad_obs2=0
replace bad_obs2=1 if tot_emissions>=5000
egen max_bad_obs2=max(bad_obs2), by(facility_id)

gen bad_obs3=0
replace bad_obs3=1 if tot_damage>1e+09
egen max_bad_obs3=max(bad_obs3), by(facility_id)

gen n4=floor(naics/100)
gen n5=floor(naics/10)
gen lead_1=1
gen lead_2=0
replace lead_2=avg_old_code_prob if year==2013
gen lead_3=0
replace lead_3=avg_old_code_prob if year==2012
gen lead_4=0
replace lead_4=avg_old_code_prob if year==2011
gen lead_5=0
replace lead_5=avg_old_code_prob if year==2010
gen lag_1=0
replace lag_1=avg_old_code_prob if year==2015
gen lag_2=0
replace lag_2=avg_old_code_prob if year==2016
gen lag_3=0
replace lag_3=avg_old_code_prob if year==2017
gen lag_4=0
replace lag_4=avg_old_code_prob if year==2018
gen lag_5=0
replace lag_5=avg_old_code_prob if year==2019

set emptycells drop
set matsize 5000

reghdfe  log_damage55 treatXpost if year>2009&max_damage>0&max_bad_obs2==0&min_year<2015&max_year>2014, absorb(facility_id n5#year) vce(cluster state_number)
gen key_estimation_sample=e(sample)
  egen state_number2=mode(state_number), by(facility_id)
egen state_number3=min(state_number), by(facility_id)
replace state_number2=state_number3 if state_number2==.
replace state_number=state_number2
drop state_number2 state_number3
save "$rootpath/processed_data/Estimation_Data.dta", replace

use "$rootpath/processed_data/Temp_Reg_FileV5.dta", clear

log using "$rootpath/log/Main_Results.log", replace
reghdfe  log_emissions4 treatXpost if key_estimation_sample==1, absorb(facility_id n5#year) vce(cluster facility_id)
reghdfe  log_damage55 treatXpost if key_estimation_sample==1, absorb(facility_id n5#year) vce(cluster facility_id)
mkspline2 dose = treatXpost, cubic knots(0 0.15 1)
reghdfe  log_emissions4 dose* if key_estimation_sample==1, absorb(facility_id n5#year) vce(cluster state_number)
mfxrcspline, link(identity) level(95) generate(test lb ub)
gen ci_size=ub - lb
summarize test lb ub ci_size if post==1
set seed 99164 //WSU's zip code. 
bootstrap mean=r(mean), cluster(facility_id) idcluster(testgroup) group(facility_id) reps(200): summarize test if post==1
drop dose* test lb ub ci_size

mkspline2 dose = treatXpost, cubic knots(0 0.15 1)
reghdfe  log_damage55 dose* if key_estimation_sample==1, absorb(facility_id n5#year) vce(cluster state_number)
mfxrcspline, link(identity) level(95) generate(test lb ub)
gen ci_size=ub - lb
summarize test lb ub ci_size if post==1
set seed 99164 //WSU's zip code. 
bootstrap mean=r(mean), cluster(facility_id) idcluster(testgroup) group(facility_id) reps(200): summarize test if post==1
drop dose* test lb ub ci_size

gen n4_year=10000*n5 + year
sort facility_id year
itercenter log_emissions4 log_damage55 treatXpost, fe(facility_id n4_year) mean xfe(d)
sort facility_id year
keep facility_id year d_log_emissions4 d_log_damage55 d_treatXpost
save "$rootpath/processed_data/Temp_MergeAPR24.dta", replace

use "$rootpath/processed_data/Estimation_Data.dta", clear
sort facility_id year
merge 1:1 facility_id year using "$rootpath/processed_data/Temp_MergeAPR24.dta"
drop _merge
save "$rootpath/processed_data/Estimation_Data.dta", replace

use "$rootpath/processed_data/Estimation_Data.dta", clear
sort frsnumber
merge m:1 frsnumber using "$rootpath/processed_data/DUNS_merged_APR24.dta"
drop if _merge==2
gen no_parent_match=0
replace no_parent_match=1 if _merge==1
drop _merge
sort facility_id year
save "$rootpath/processed_data/Estimation_Data.dta", replace

use "$rootpath/processed_data/Estimation_Data.dta", clear
sort state_number
merge m:1 state_number using "$rootpath/processed_data/state_hpv_count.dta"
drop if _merge!=3
drop _merge
sort frsnumber
merge m:1 frsnumber using "$rootpath/processed_data/wl_map_nodups.dta"
gen watch_list=0
replace watch_list=1 if _merge==3
drop if _merge==2
drop _merge
sort facility_id year
save "$rootpath/processed_data/Estimation_Data.dta", replace


use "$rootpath/processed_data/Estimation_Data.dta", clear
sort state_number
merge m:1 state_number using "$rootpath/processed_data/state_delisted_hpv_count.dta"
drop if _merge==2
replace state_delisted_hpvs=0 if _merge==1
drop _merge
merge m:1 state_number using "$rootpath/processed_data/state_resource_mapping_ALT.dta"
drop if _merge==2
replace avg_old_code_prob2 = 0 if _merge==1
drop _merge
sort facility_id year
save "$rootpath/processed_data/Estimation_Data.dta", replace

eststo clear
use "$rootpath/processed_data/working_file_sep2023.dta", clear
drop avg_old_code_prob treatXpost treat
sort state

merge m:1 state using "$rootpath/processed_data/state_resource_mappingV21.dta"
drop if _merge==2
drop _merge
gen treat=avg_old_code_prob
gen treatXpost=treat*post
rename fips fips_code
sort fips_code
merge m:1 fips_code using "$rootpath/processed_data/fips_GDP.dta"
keep if _merge==3
drop _merge
gen multiplier=.
//Note Y16 is 2008. 
foreach num of numlist 16/27 {
gen year_temp=1992 + `num'
replace multiplier=Y`num'/Y22 if year==year_temp
drop year_temp
}
drop log_damage2 log_crit_emissions
replace tot_damage2=tot_damage2*multiplier
gen log_damage2=asinh(tot_damage2/1000)
gen log_crit_emissions=asinh(tot_crit_emissions)
gen n5=floor(naics/10)
gen n2=floor(naics/10000)
keep if year>2009&pm25!=.
save "$rootpath/processed_data/TRI_estimation_data.dta", replace
//This is to clean the TRI data further.
use "$rootpath/processed_data/TRI_estimation_data.dta", clear
keep frsnumber year log_damage2 treatXpost n5 log_crit_emissions afs_id plant_id naics avg_old_code_prob
save "$rootpath/processed_data/TRI_estimation_data.dta", replace

//We are dropping unnecessary variables to reduce the file size of the replication package


use "$rootpath/processed_data/Estimation_Data.dta", clear
///This is NO2 and SO2 only
gen tot_damage_WINOXSOX = (0*nh3*NH3 + 0*PM25PRI*pm25 + nox*NOX + so2*SO2 + 0*VOC*voc)*1.47
replace tot_damage_WINOXSOX=tot_damage_WINOXSOX*multiplier
gen log_tot_damage_WINOXSOX=asinh(tot_damage_WINOXSOX/(1.47*100000))
gen log_emis_WINOXSOX=asinh((0*NH3 + 0*PM25PRI + NOX + SO2 + 0*VOC)*(1/10))

///This is without SO2 and NO2
gen tot_damage_WONOXSOX = (nh3*NH3 + PM25PRI*pm25 + 0*nox*NOX + 0*so2*SO2 + VOC*voc)*1.47
replace tot_damage_WONOXSOX=tot_damage_WONOXSOX*multiplier
gen log_tot_damage_WONOXSOX=asinh(tot_damage_WONOXSOX/(1.47*100000))
gen log_emis_WONOXSOX=asinh((NH3 + PM25PRI + 0*NOX + 0*SO2 + VOC)*(1/10))
save "$rootpath/processed_data/Estimation_Data.dta", replace


//Compliance data. 
use "$rootpath/processed_data/final_annual_plant_file.dta", clear
keep frsnumber year air_operating_status_code epa_region quarterly_penaltiesnofed annual_quarterly_tests2 quarterly_full_inspectionsnofed quarterly_part_inspectionsnofed annual_full_inspec annual_part_inspec annual_quarterly_penalties2 annual_tot_hpv2 annual_tot_frv2
sort frsnumber year
save "$rootpath/processed_data/estimation_ICIS_plant_file.dta", replace


use "$rootpath/processed_data/Estimation_Data.dta", clear
drop if key_estimation_sample!=1
set seed 99164
sort frsnumber year, stable
quietly by frsnumber year:  gen dup = cond(_N==1,0,_n)
gen duplicated=0
replace duplicated=1 if dup>1
gsort -duplicated
gen test_n=_n
summarize frsnumber
replace frsnumber=r(max)*10000+test_n if dup>1
drop dup
sort frsnumber year, stable
quietly by frsnumber year:  gen dup = cond(_N==1,0,_n)
drop if dup>1
drop dup
merge 1:1 frsnumber year using "$rootpath/processed_data/estimation_ICIS_plant_file.dta"

drop if _merge==2
drop _merge
foreach var in quarterly_penaltiesnofed annual_quarterly_tests2 quarterly_full_inspectionsnofed quarterly_part_inspectionsnofed annual_full_inspec annual_part_inspec annual_quarterly_penalties2 annual_tot_hpv2 annual_tot_frv2 {
replace `var'=0 if `var'==.
}

gen region_year=epa_region*10000+year
gen naics_year=naics*10000+year
egen years_observed=nvals(year), by(frsnumber)
gen fined=0
replace fined=1 if annual_quarterly_penalties2>0


gen insp_rate=(annual_full_inspec+0*annual_quarterly_tests2+annual_part_inspec)/annual_tot_hpv2
gen insp_rate22=(annual_full_inspec+0*annual_quarterly_tests2+annual_part_inspec)/annual_tot_frv2

gen state_insp_rate=(quarterly_full_inspectionsnofed+quarterly_part_inspectionsnofed)/annual_tot_hpv2
gen state_insp_rate22=(quarterly_full_inspectionsnofed+quarterly_part_inspectionsnofed)/annual_tot_frv2

gen fine_rate=annual_quarterly_penalties2/(annual_tot_hpv2)
gen fine_rate22=annual_quarterly_penalties2/annual_tot_frv2

gen state_fine_rate=quarterly_penaltiesnofed/(annual_tot_hpv2)
gen state_fine_rate22=quarterly_penaltiesnofed/(annual_tot_frv2)
reghdfe insp_rate treatXpost if air_operating_status_code=="OPR"&years_observed==10, absorb(frsnumber  epa_region#year naics#c.year) vce(cluster frsnumber)
summarize insp_rate if e(sample)==1
savesome frsnumber year epa_region region_year naics_year naics state_fine_rate state_insp_rate fine_rate insp_rate treatXpost state_number if e(sample)==1 using "$rootpath/processed_data/Estimation_HPV_Subset.dta", replace
reghdfe insp_rate22 treatXpost if air_operating_status_code=="OPR"&years_observed==10&annual_tot_hpv2==0, absorb(frsnumber  epa_region#year naics#c.year) vce(cluster frsnumber)
summarize insp_rate if e(sample)==1
savesome frsnumber year epa_region region_year naics_year naics state_fine_rate22 state_insp_rate22 fine_rate22 insp_rate22 treatXpost state_number if e(sample)==1 using "$rootpath/processed_data/Estimation_FRV_Subset.dta", replace



use "$rootpath/processed_data/Estimation_HPV_Subset.dta", clear
sort frsnumber year
itercenter state_fine_rate state_insp_rate fine_rate insp_rate treatXpost, fe(frsnumber region_year naics_year) mean xfe(d)
save "$rootpath/processed_data/Estimation_HPV_Subset.dta", replace

use "$rootpath/processed_data/Estimation_FRV_Subset.dta", clear
sort frsnumber year
itercenter state_fine_rate22 state_insp_rate22 fine_rate22 insp_rate22 treatXpost, fe(frsnumber region_year naics_year) mean xfe(d)
save "$rootpath/processed_data/Estimation_FRV_Subset.dta", replace



eststo clear
clear
import delimited "$rootpath/raw_data/SBudget_Data_Raw.csv", encoding(UTF-8) 
sort year
merge m:1 year using "$rootpath/processed_data/Inflation_Data_Mar20.dta"
drop _merge
destring total, gen(tot_budget) force
replace tot_budget=(1.06*tot_budget)/(inflation_2014)
sort state 
merge m:1 state using "$rootpath/processed_data/state_resource_mappingV21.dta"
drop if _merge==2
drop _merge

gen post=0
replace post=1 if year>2014
gen treat=0
replace treat=avg_old_code_prob if avg_old_code_prob!=.
gen treatXpost=treat*post
encode state, gen(sid)
xtset sid year
reg tot_budget treatXpost i.sid i.year if year>2009, vce(cluster state)
xtreg tot_budget treatXpost i.year if year>2009, fe vce(cluster state)
gen log_budget=log(tot_budget+1)
reg log_budget treatXpost i.sid i.year if year>2009, vce(cluster state)
xtreg log_budget treatXpost i.year if year>2009, fe vce(cluster state)

//Code below produces the parallel trends for budget and treatXpost
gen treat1=.
replace treat1=1 if avg_old_code_prob>0.53
replace treat1=0 if avg_old_code_prob<0.23
drop if state=="AK"|state=="HI"
sort sid year
xtset sid year
gen log_budget1=.
replace log_budget1=tot_budget if treat1==1
egen temp1=mean(log_budget1), by(year)
gen avg_log_budget1=log(temp1)

gen log_budget0=.
replace log_budget0 = tot_budget if treat1==0
egen temp0=mean(log_budget0), by(year)
gen avg_log_budget0=log(temp0)
egen log_budget2=mean(tot_budget), by(year)
gen avg_log_budget=log(log_budget2)

sort year
quietly by year:  gen dup_year= cond(_N==1,0,_n)
keep if dup_year==1&year>2009
label var avg_log_budget "All States"
label var avg_log_budget1 "High Treatment State"
label var avg_log_budget0 "Low Treatment State"
keep avg_log_budget avg_log_budget1 avg_log_budget0 year
save "$rootpath/processed_data/Fig3_Data.dta", replace


use "$rootpath/processed_data/Estimation_Data.dta", clear
sort frsnumber year
merge m:1 frsnumber year using "$rootpath/processed_data/transfered_plant_level.dta"
drop if _merge==2
rename _merge _merge_other
sort frsnumber year
merge m:1 frsnumber year using "$rootpath/processed_data/transfered_plant_history.dta"
drop if _merge==2
drop _merge
replace max_frv_year=0 if max_frv_year==.
replace max_hpv_year=0 if max_hpv_year==.
replace max_hpv_year=1 if _merge_other==3
save "$rootpath/processed_data/Estimation_Data.dta", replace


use "$rootpath/processed_data/Estimation_Data.dta", clear
drop if key_estimation_sample==0
sort facility_id year
twowayfeweights log_damage55 facility_id year treatXpost, type(feTR) summary_measures path("$rootpath/processed_data/test_weights.dta")
  
use "$rootpath/processed_data/test_weights.dta", clear 
egen testval=nvals(Group_TWFE)
summarize testval
summarize Group_TWFE, detail

total weight
summarize weight
gen test101=r(N)
gen test100=r(min)
///So right here what we are doing is moving the weights to the right and rescaling so they 
//add up to 1. 
gen new_weightX=(weight-test100)*(1/(1 - test100*r(N)))
sort Group_TWFE
//Next, we flip it where the observations with the most weight get less. 
total new_weightX
gen inv_temp_new_weight2=(1/new_weightX)
summarize inv_temp_new_weight2
gen constantX=1/(r(mean)*r(N))
gen new_weight_fin=constantX/new_weightX
total new_weight_fin
sort Group_TWFE
egen new_weight=mean(new_weight_fin), by(Group_TWFE)
sort Group_TWFE
quietly by Group_TWFE:  gen dup = cond(_N==1,0,_n)
drop if dup>1
//rename new_weight_fin new_weight
rename Group_TWFE facility_id 
rename Time_TWFE year
keep facility_id new_weight
save "$rootpath/processed_data/new_weights.dta", replace
 
 
use "$rootpath/processed_data/Estimation_Data.dta", clear
sort facility_id
merge m:1 facility_id using "$rootpath/processed_data/new_weights.dta"
drop if _merge==2

summarize new_weight if _merge==3
gen test100=1-(r(mean)*r(N))
summarize facility_id if post>0&_merge!=3
replace new_weight=new_weight*(1+(test100)) if _merge==3
total new_weight

summarize new_weight
replace new_weight=r(mean) if new_weight==.

sort facility_id year
egen new_weight_mean=mean(new_weight), by(facility_id)

summarize new_weight
replace new_weight=new_weight*(1/(r(mean)*r(N)))
drop _merge
save "$rootpath/processed_data/Estimation_Data.dta", replace

//This is to clean the estimation data further. 
use "$rootpath/processed_data/Estimation_Data.dta", clear
//We are dropping unnecessary variables to reduce the file size of the replication package
drop facility_name zipcode NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead fipscode
drop y_* dup_fips nh3 pm25 so2 voc yfe_* primary_name county_name state_name Y16 Y17 
drop Y18 Y19 Y20 Y21 Y22 Y23 Y24 Y25 Y26 Y27 multiplier log_damage3 log_damage2 log_damage4 log_damage5
drop log_emissions3 ORG_NAME
save "$rootpath/processed_data/Estimation_Data.dta", replace



///This is to create the table 3 data
use "$rootpath/processed_data/SRF_Results.dta", clear
sort state
merge 1:1 state using "$rootpath/processed_data/state_resource_mappingV21.dta"
sort state
keep state penaltyproblem inspectionproblem  violationproblem enforcementproblem avg_old_code_prob
save "$rootpath/processed_data/TAB3_Estimation_Data.dta", replace


use "$rootpath/processed_data/Estimation_Data.dta", clear
keep if key_estimation_sample==1
gen n2=floor(naics/10000)
gen manufact=0
replace manufact=1 if (n2==31&n2!=.)|(n2==32&n2!=.)|(n2==33&n2!=.)
sort state
egen avg_manufact=mean(manufact), by(state)
sort state
quietly by state:  gen dups = cond(_N==1,0,_n)
drop if dups>1
keep state avg_manufact
merge 1:1 state using "$rootpath/processed_data/TAB3_Estimation_Data.dta"
drop if _merge==2
drop _merge
drop if state=="HI"|state=="AK"
save "$rootpath/processed_data/TAB3_Estimation_Data.dta", replace


clear
import delimited "$rootpath/raw_data/SBudget_Data_Raw.csv", encoding(UTF-8) 
sort year
merge m:1 year using "$rootpath/processed_data/Inflation_Data_Mar20.dta"
drop _merge
destring total, gen(tot_budget) force
replace tot_budget=(1.02*tot_budget)/(inflation_2017)
keep if state!="AK"&state!="HI"
gen log_budget=log(tot_budget+1)

egen avg_budget = mean(tot_budget), by( state)
gen avg_log_budget = log(avg_budget +1)
keep if year == 2010
keep state avg_log_budget
sort state
merge 1:1 state using "$rootpath/processed_data/TAB3_Estimation_Data.dta"
drop if _merge==2
drop _merge
sort state
save "$rootpath/processed_data/TAB3_Estimation_Data.dta", replace

///This is to assemble the ML data
///First we need to define a dataset for each GC and matrix criteria. 
use "$rootpath/processed_data/kept_actionsV21.dta", clear
sort afs_id
merge m:1 afs_id using "$rootpath/processed_data/FRS_AFS_LINK.dta"
keep if _merge==3
drop _merge
gen year=floor(date_achieved/10000)
tab year
sort frsnumber year
gen GC10=0
replace GC10=1 if regexm(all_violation_type_codes, "G10")==1

foreach num of numlist 1/9 {
gen GC`num'=0
replace GC`num'=1 if regexm(all_violation_type_codes, "GC`num'")==1
}

foreach num of numlist 1/4 {
gen MC`num'=0
replace MC`num'=1 if regexm(all_violation_type_codes, "M`num'")==1
}
 
 
foreach num of numlist 1/10 {
sort frsnumber year
egen G`num' = max(GC`num'), by(frsnumber year)
}
 
foreach num of numlist 1/4 {
egen M`num' = max(MC`num'), by(frsnumber year)
}
 
keep frsnumber year M1 M2 M3 M4 G1 G2 G3 G4 G5 G6 G7 G8 G9 G10
sort frsnumber year
quietly by frsnumber year:  gen dup = cond(_N==1,0,_n) 
drop if dup>1
drop dup
sort frsnumber year
save "$rootpath/processed_data/transfered_plant_levelML.dta", replace


use "$rootpath/processed_data/Temp_Reg_FileV4.dta", clear
sort facility_id 
rename facility_id eis_id
merge m:1 eis_id using "$rootpath/processed_data/eis_id_to_frsnumber1.dta"
drop if _merge!=3
drop _merge
gen tot_emissions=(NH3 + PM25PRI + NOX + SO2 + VOC)
keep log_damage tot_damage tot_emissions frsnumber year
replace log_damage=asinh(tot_damage/(1.47*100000))
replace tot_damage=tot_damage/1.47
sort frsnumber year
merge m:1 frsnumber year using "$rootpath/processed_data/transfered_plant_levelML.dta"
keep if _merge==3
keep if year>2009&year<2015
drop _merge
sort frsnumber year
saveold "$rootpath/processed_data/ML_dataAPR24.dta", replace v(12)


use "$rootpath/processed_data/ML_dataAPR24.dta", clear
vselect log_damage M1 M2 M3 M4 G1 G2 G3 G4 G5 G6 G7 G8 G9 G10, best


